/* compile-time settings:

 FORTRAN naming convention
 default      cpgs_setup, etc.
 -DUPCASE     CPGS_SETUP, etc.
 -DUNDERSCORE cpgs_setup_, etc.

 -DMPI             parallel version (sequential otherwise)
 -DCRYSTAL_STATIC  avoids some message exchanges, at the risk of
                   crashing because of an insufficient buffer size

 -DINITIAL_BUFFER_SIZE=expression
 ignored unless CRYSTAL_STATIC is defined;
 an arithmetic expression controlling the initial buffer size for the crystal
 router. It must be large enough to hold the intermediate messages
 during all stages of the crystal router.

 variables that can be used in the expression include
 num   - the number of processors
 n     - the length of the global index array

 */

/* default for INITIAL_BUFFER_SIZE */
#ifdef CRYSTAL_STATIC
#ifndef INITIAL_BUFFER_SIZE
#define INITIAL_BUFFER_SIZE 2 * ( 3 * num + n * 9 )
#endif
#endif
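
/* For example, a build that wants a larger static buffer might pass
   (hypothetical values, shown only to illustrate the syntax):

       -DCRYSTAL_STATIC "-DINITIAL_BUFFER_SIZE=4*(3*num+9*n)"

   The expression is substituted textually wherever the macro is expanded,
   so it may reference the run-time variables num and n named above. */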

/* FORTRAN usage:

 integer hc, np
 call crystal_new(hc,comm,np)  ! get a crystal router handle (see fcrystal.c)

 integer hgs
 integer n, max_vec_dim
 integer*? global_index_array(1:n) ! type corresponding to slong in "types.h"

 call cpgs_setup(hgs,hc,global_index_array,n,max_vec_dim)
 sets hgs to a new handle

 !ok to call crystal_done(hc) here, or any later time

 call cpgs_op(hgs, u, op)
 integer hgs, op : 1-add, 2-multiply, 3-min, 4-max
 real    u(1:n) - same layout as global_index_array provided to cpgs_setup

 call cpgs_op_vec(hgs, u, d, op)
 integer op : 1-add, 2-multiply, 3-min, 4-max
 integer d    <= max_vec_dim
 real    u(1:d, 1:n) - vector components for each node stored together

 call cpgs_op_many(hgs, u1, u2, u3, u4, u5, u6, d, op)
 integer op : 1-add, 2-multiply, 3-min, 4-max
 integer d : in {1,2,3,4,5,6}, <= max_vec_dim
 real    u1(1:n), u2(1:n), u3(1:n), etc.

 same effect as: call cpgs_op(hgs, u1, op)
 if(d.gt.1) call cpgs_op(hgs, u2, op)
 if(d.gt.2) call cpgs_op(hgs, u3, op)
 etc.
 with possibly some savings as fewer messages are exchanged

 call cpgs_free(hgs)
 */
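
/* C++ usage (rough sketch for an MPI build; the exact declarations live in
   moab/gs.hpp and this snippet has not been checked against that header --
   treat constructor forms and access specifiers as assumptions; n, maxv, d,
   id, uid, u and v are placeholders):

   moab::gs_data::crystal_data crystal;
   crystal.initialize( MPI_COMM_WORLD );        // wrap the communicator

   // id[i] is the global label of local entry i (0 marks an unused entry),
   // uid[i] an extra Ulong label carried along with it
   moab::gs_data gs;
   moab::ErrorCode rval = gs.initialize( n, id, uid, maxv,
                                         1, 1, &crystal );  // 1 label, 1 ulabel per entry
   if( moab::MB_SUCCESS != rval ) { ...handle error... }

   gs.gs_data_op( u, GS_OP_ADD );        // scalar gather-scatter, u has n entries
   gs.gs_data_op_vec( v, d, GS_OP_MAX ); // d <= maxv, components of a node stored together

   gs.reset();         // frees the local condense map and the nonlocal info
   crystal.reset();
 */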

#include <cstdio>
#include <cstdlib>
#include <cstdarg>
#include <cstring>
#include <cmath>
#include "moab/gs.hpp"
#ifdef MOAB_HAVE_MPI
#include "moab_mpi.h"
#endif

#ifdef MOAB_HAVE_MPI

#ifdef MOAB_HAVE_VALGRIND
#include <valgrind/memcheck.h>
#elif !defined( VALGRIND_CHECK_MEM_IS_DEFINED )
#define VALGRIND_CHECK_MEM_IS_DEFINED( a, b ) ( (void)0 )
#endif

#endif

namespace moab
{

/*--------------------------------------------------------------------------
 Local Execution Phases
 --------------------------------------------------------------------------*/

#define DO_SET( a, b ) b = a
#define DO_ADD( a, b ) a += b
#define DO_MUL( a, b ) a *= b
#define DO_MIN( a, b ) \
    if( ( b ) < ( a ) ) ( a ) = b
#define DO_MAX( a, b ) \
    if( ( b ) > ( a ) ) ( a ) = b
#define DO_BPR( a, b )         \
    do                         \
    {                          \
        uint a_ = a;           \
        uint b_ = b;           \
        for( ;; )              \
        {                      \
            if( a_ < b_ )      \
                b_ >>= 1;      \
            else if( b_ < a_ ) \
                a_ >>= 1;      \
            else               \
                break;         \
        }                      \
        ( a ) = a_;            \
    } while( 0 )

#define LOOP( op )                       \
    do                                   \
    {                                    \
        sint i, j;                       \
        while( ( i = *cm++ ) != -1 )     \
            while( ( j = *cm++ ) != -1 ) \
                op( u[i], u[j] );        \
    } while( 0 )
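
/* The local condense map "cm" consumed by LOOP is a flat array of index
   groups: each group lists the local indices that carry the same label,
   terminated by -1, and the whole map ends with one extra -1 (see the map
   construction in gs_data::initialize() below).  For example, with the
   made-up map

       cm = { 0, 4, 7, -1,   2, 5, -1,   -1 }

   local_condense(u, GS_OP_ADD, cm) performs u[0] += u[4]; u[0] += u[7];
   u[2] += u[5]; and local_uncondense(u, cm) copies the head value of each
   group back to the other members (u[4] = u[7] = u[0]; u[5] = u[2]). */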

static void local_condense( realType* u, int op, const sint* cm )
{
    switch( op )
    {
        case GS_OP_ADD:
            LOOP( DO_ADD );
            break;
        case GS_OP_MUL:
            LOOP( DO_MUL );
            break;
        case GS_OP_MIN:
            LOOP( DO_MIN );
            break;
        case GS_OP_MAX:
            LOOP( DO_MAX );
            break;
        case GS_OP_BPR:
            LOOP( DO_BPR );
            break;
    }
}

static void local_uncondense( realType* u, const sint* cm )
{
    LOOP( DO_SET );
}

#undef LOOP

#define LOOP( op )                        \
    do                                    \
    {                                     \
        sint i, j, k;                     \
        while( ( i = *cm++ ) != -1 )      \
        {                                 \
            realType* pi = u + n * i;     \
            while( ( j = *cm++ ) != -1 )  \
            {                             \
                realType* pj = u + n * j; \
                for( k = n; k; --k )      \
                {                         \
                    op( *pi, *pj );       \
                    ++pi;                 \
                    ++pj;                 \
                }                         \
            }                             \
        }                                 \
    } while( 0 )

static void local_condense_vec( realType* u, uint n, int op, const sint* cm )
{
    switch( op )
    {
        case GS_OP_ADD:
            LOOP( DO_ADD );
            break;
        case GS_OP_MUL:
            LOOP( DO_MUL );
            break;
        case GS_OP_MIN:
            LOOP( DO_MIN );
            break;
        case GS_OP_MAX:
            LOOP( DO_MAX );
            break;
        case GS_OP_BPR:
            LOOP( DO_BPR );
            break;
    }
}

static void local_uncondense_vec( realType* u, uint n, const sint* cm )
{
    LOOP( DO_SET );
}

#undef LOOP

/*--------------------------------------------------------------------------
 Non-local Execution Phases
 --------------------------------------------------------------------------*/

#ifdef MOAB_HAVE_MPI
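
/* Per-partner communication tables.  initialize() below carves _target (np
   partner ranks), _nshared (np per-partner counts) and _sh_ind (count local
   indices of shared entries, grouped by partner) out of a single allocation
   of 2*np + count uints.  _slabels holds the nlabels-1 secondary labels per
   shared entry (the primary label is implicit), _ulabels the nulabels Ulong
   labels, _reqs 2*np MPI requests, and _buf 2*count*maxv reals: the
   nonlocal*() routines pack outgoing values into the first half, receive the
   partners' values into the second half, and fold them into u with the
   requested reduction. */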

void gs_data::nonlocal_info::initialize( uint np, uint count, uint nlabels, uint nulabels, uint maxv )
{
    _target  = NULL;
    _nshared = NULL;
    _sh_ind  = NULL;
    _slabels = NULL;
    _ulabels = NULL;
    _reqs    = NULL;
    _buf     = NULL;
    _np      = np;
    _target  = (uint*)malloc( ( 2 * np + count ) * sizeof( uint ) );
    _nshared = _target + np;
    _sh_ind  = _nshared + np;
    if( 1 < nlabels )
        _slabels = (slong*)malloc( ( ( nlabels - 1 ) * count ) * sizeof( slong ) );
    else
        _slabels = NULL;
    _ulabels = (Ulong*)malloc( ( nulabels * count ) * sizeof( Ulong ) );
    _reqs    = (MPI_Request*)malloc( 2 * np * sizeof( MPI_Request ) );
    _buf     = (realType*)malloc( ( 2 * count * maxv ) * sizeof( realType ) );
    _maxv    = maxv;
}

void gs_data::nonlocal_info::nlinfo_free()
{
    // Free the ptrs
    free( _buf );
    free( _reqs );
    free( _target );
    free( _slabels );
    free( _ulabels );
    // Set to null
    _ulabels = NULL;
    _buf     = NULL;
    _reqs    = NULL;
    _target  = NULL;
    _slabels = NULL;
    _nshared = NULL;
    _sh_ind  = NULL;
}

void gs_data::nonlocal_info::nonlocal( realType* u, int op, MPI_Comm comm )
{
    MPI_Status status;
    uint np           = this->_np;
    MPI_Request* reqs = this->_reqs;
    uint* targ        = this->_target;
    uint* nshared     = this->_nshared;
    uint* sh_ind      = this->_sh_ind;
    uint id;
    realType *buf = this->_buf, *start;
    unsigned int i;
    {
        MPI_Comm_rank( comm, (int*)&i );
        id = i;
    }
    for( i = 0; i < np; ++i )
    {
        uint c = nshared[i];
        start  = buf;
        for( ; c; --c )
            *buf++ = u[*sh_ind++];
        MPI_Isend( (void*)start, nshared[i] * sizeof( realType ), MPI_UNSIGNED_CHAR, targ[i], id, comm, reqs++ );
    }
    start = buf;
    for( i = 0; i < np; ++i )
    {
        MPI_Irecv( (void*)start, nshared[i] * sizeof( realType ), MPI_UNSIGNED_CHAR, targ[i], targ[i], comm, reqs++ );
        start += nshared[i];
    }
    for( reqs = this->_reqs, i = np * 2; i; --i )
        MPI_Wait( reqs++, &status );
    sh_ind = this->_sh_ind;
#define LOOP( OP )                        \
    do                                    \
    {                                     \
        for( i = 0; i < np; ++i )         \
        {                                 \
            uint c;                       \
            for( c = nshared[i]; c; --c ) \
            {                             \
                OP( u[*sh_ind], *buf );   \
                ++sh_ind, ++buf;          \
            }                             \
        }                                 \
    } while( 0 )
    switch( op )
    {
        case GS_OP_ADD:
            LOOP( DO_ADD );
            break;
        case GS_OP_MUL:
            LOOP( DO_MUL );
            break;
        case GS_OP_MIN:
            LOOP( DO_MIN );
            break;
        case GS_OP_MAX:
            LOOP( DO_MAX );
            break;
        case GS_OP_BPR:
            LOOP( DO_BPR );
            break;
    }
#undef LOOP
}

void gs_data::nonlocal_info::nonlocal_vec( realType* u, uint n, int op, MPI_Comm comm )
{
    MPI_Status status;
    uint np           = this->_np;
    MPI_Request* reqs = this->_reqs;
    uint* targ        = this->_target;
    uint* nshared     = this->_nshared;
    uint* sh_ind      = this->_sh_ind;
    uint id;
    realType *buf = this->_buf, *start;
    uint size     = n * sizeof( realType );
    unsigned int i;
    {
        MPI_Comm_rank( comm, (int*)&i );
        id = i;
    }
    for( i = 0; i < np; ++i )
    {
        uint ns = nshared[i], c = ns;
        start = buf;
        for( ; c; --c )
        {
            memcpy( buf, u + n * ( *sh_ind++ ), size );
            buf += n;
        }
        MPI_Isend( (void*)start, ns * size, MPI_UNSIGNED_CHAR, targ[i], id, comm, reqs++ );
    }
    start = buf;
    for( i = 0; i < np; ++i )
    {
        int nsn = n * nshared[i];
        /* receive count must match the partner's send above: nshared[i] entries of n reals each */
        MPI_Irecv( (void*)start, nshared[i] * size, MPI_UNSIGNED_CHAR, targ[i], targ[i], comm, reqs++ );
        start += nsn;
    }
    for( reqs = this->_reqs, i = np * 2; i; --i )
        MPI_Wait( reqs++, &status );
    sh_ind = this->_sh_ind;
#define LOOP( OP )                                    \
    do                                                \
    {                                                 \
        for( i = 0; i < np; ++i )                     \
        {                                             \
            uint c, j;                                \
            for( c = nshared[i]; c; --c )             \
            {                                         \
                realType* uu = u + n * ( *sh_ind++ ); \
                for( j = n; j; --j )                  \
                {                                     \
                    OP( *uu, *buf );                  \
                    ++uu;                             \
                    ++buf;                            \
                }                                     \
            }                                         \
        }                                             \
    } while( 0 )
    switch( op )
    {
        case GS_OP_ADD:
            LOOP( DO_ADD );
            break;
        case GS_OP_MUL:
            LOOP( DO_MUL );
            break;
        case GS_OP_MIN:
            LOOP( DO_MIN );
            break;
        case GS_OP_MAX:
            LOOP( DO_MAX );
            break;
        case GS_OP_BPR:
            LOOP( DO_BPR );
            break;
    }
#undef LOOP
}

void gs_data::nonlocal_info::nonlocal_many( realType** u, uint n, int op, MPI_Comm comm )
{
    MPI_Status status;
    uint np           = this->_np;
    MPI_Request* reqs = this->_reqs;
    uint* targ        = this->_target;
    uint* nshared     = this->_nshared;
    uint* sh_ind      = this->_sh_ind;
    uint id;
    realType *buf = this->_buf, *start;
    unsigned int i;
    {
        MPI_Comm_rank( comm, (int*)&i );
        id = i;
    }
    for( i = 0; i < np; ++i )
    {
        uint c, j, ns = nshared[i];
        start = buf;
        for( j = 0; j < n; ++j )
        {
            realType* uu = u[j];
            for( c = 0; c < ns; ++c )
                *buf++ = uu[sh_ind[c]];
        }
        sh_ind += ns;
        MPI_Isend( (void*)start, n * ns * sizeof( realType ), MPI_UNSIGNED_CHAR, targ[i], id, comm, reqs++ );
    }
    start = buf;
    for( i = 0; i < np; ++i )
    {
        int nsn = n * nshared[i];
        MPI_Irecv( (void*)start, nsn * sizeof( realType ), MPI_UNSIGNED_CHAR, targ[i], targ[i], comm, reqs++ );
        start += nsn;
    }
    for( reqs = this->_reqs, i = np * 2; i; --i )
        MPI_Wait( reqs++, &status );
    sh_ind = this->_sh_ind;
#define LOOP( OP )                             \
    do                                         \
    {                                          \
        for( i = 0; i < np; ++i )              \
        {                                      \
            uint c, j, ns = nshared[i];        \
            for( j = 0; j < n; ++j )           \
            {                                  \
                realType* uu = u[j];           \
                for( c = 0; c < ns; ++c )      \
                {                              \
                    OP( uu[sh_ind[c]], *buf ); \
                    ++buf;                     \
                }                              \
            }                                  \
            sh_ind += ns;                      \
        }                                      \
    } while( 0 )
    switch( op )
    {
        case GS_OP_ADD:
            LOOP( DO_ADD );
            break;
        case GS_OP_MUL:
            LOOP( DO_MUL );
            break;
        case GS_OP_MIN:
            LOOP( DO_MIN );
            break;
        case GS_OP_MAX:
            LOOP( DO_MAX );
            break;
        case GS_OP_BPR:
            LOOP( DO_BPR );
            break;
    }
#undef LOOP
}

/*---------------------------------------------------------------------------
 MOAB Crystal Router
 ---------------------------------------------------------------------------*/
gs_data::crystal_data::crystal_data() {}

void gs_data::crystal_data::initialize( MPI_Comm comm )
{
    int num, id;
    buffers[0].buf.buffer_init( 1024 );
    buffers[1].buf.buffer_init( 1024 );
    buffers[2].buf.buffer_init( 1024 );
    all  = &buffers[0];
    keep = &buffers[1];
    send = &buffers[2];
    memcpy( &( this->_comm ), &comm, sizeof( MPI_Comm ) );
    MPI_Comm_rank( comm, &id );
    this->_id = id;
    MPI_Comm_size( comm, &num );
    this->_num = num;
}

void gs_data::crystal_data::reset()
{
    buffers[0].buf.reset();
    buffers[1].buf.reset();
    buffers[2].buf.reset();
    keep = NULL;
    all  = NULL;
    send = NULL;
}
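
/* Each crystal_buf holds a sequence of variable-length records of uints, as
   packed by gs_transfer() below:
       record[0] = destination proc, record[1] = source proc,
       record[2] = payload length in uints, followed by the payload.
   partition() splits records by destination, send_() forwards them to the
   partner process, and crystal_router() repeats this until every record has
   reached its destination. */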

// Partition data before sending
void gs_data::crystal_data::partition( uint cutoff, crystal_buf* lo, crystal_buf* hi )
{
    const uint* src = (uint*)all->buf.ptr;
    const uint* end = (uint*)src + all->n;
    uint *target, *lop, *hip;
    lo->n = hi->n = 0;
    lo->buf.buffer_reserve( all->n * sizeof( uint ) );
    hi->buf.buffer_reserve( all->n * sizeof( uint ) );
    lop = (uint*)lo->buf.ptr;
    hip = (uint*)hi->buf.ptr;
    while( src != end )
    {
        uint chunk_len = 3 + src[2];
        if( src[0] < cutoff )
        {
            target = lop;
            lo->n += chunk_len;
            lop += chunk_len;
        }
        else
        {
            target = hip;
            hi->n += chunk_len;
            hip += chunk_len;
        }
        memcpy( target, src, chunk_len * sizeof( uint ) );
        src += chunk_len;
    }
}

// Send message to target process
void gs_data::crystal_data::send_( uint target, int recvn )
{
    MPI_Request req[3] = { MPI_REQUEST_NULL, MPI_REQUEST_NULL, MPI_REQUEST_NULL };
    MPI_Status status[3];
    uint count[2] = { 0, 0 }, sum, *recv[2];
    crystal_buf* t;
    int i;

    (void)VALGRIND_CHECK_MEM_IS_DEFINED( &send->n, sizeof( uint ) );
    MPI_Isend( (void*)&send->n, sizeof( uint ), MPI_UNSIGNED_CHAR, target, _id, _comm, &req[0] );
    for( i = 0; i < recvn; ++i )
        MPI_Irecv( (void*)&count[i], sizeof( uint ), MPI_UNSIGNED_CHAR, target + i, target + i, _comm, &req[i + 1] );
    MPI_Waitall( recvn + 1, req, status );
    sum = keep->n;
    for( i = 0; i < recvn; ++i )
        sum += count[i];
    keep->buf.buffer_reserve( sum * sizeof( uint ) );
    recv[0] = (uint*)keep->buf.ptr;
    recv[0] += keep->n;
    recv[1] = recv[0] + count[0];
    keep->n = sum;

    (void)VALGRIND_CHECK_MEM_IS_DEFINED( send->buf.ptr, send->n * sizeof( uint ) );
    MPI_Isend( (void*)send->buf.ptr, send->n * sizeof( uint ), MPI_UNSIGNED_CHAR, target, _id, _comm, &req[0] );
    if( recvn )
    {
        MPI_Irecv( (void*)recv[0], count[0] * sizeof( uint ), MPI_UNSIGNED_CHAR, target, target, _comm, &req[1] );
        if( recvn == 2 )
            MPI_Irecv( (void*)recv[1], count[1] * sizeof( uint ), MPI_UNSIGNED_CHAR, target + 1, target + 1, _comm,
                       &req[2] );
    }
    MPI_Waitall( recvn + 1, req, status );

    t    = all;
    all  = keep;
    keep = t;
}
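
/* The crystal router delivers every record to its destination in about
   ceil(log2(P)) rounds.  In each round the active ranks [bl, bl+n) are split
   at bh = bl + n/2; records destined for the other half are handed to the
   partner rank n/2 away, and each half then recurses on itself.  When n is
   odd, the extra high rank sends its low-half records to the last low rank
   and receives nothing that round.  For example, with 3 ranks {0,1,2}:
   round 1 pairs 0 <-> 1 while 2 sends its rank-0 records to 0; round 2
   exchanges between 1 and 2. */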

void gs_data::crystal_data::crystal_router()
{
    uint bl = 0, bh, n = _num, nl, target;
    int recvn;
    crystal_buf *lo, *hi;
    while( n > 1 )
    {
        nl = n / 2, bh = bl + nl;
        if( _id < bh )
        {
            target = _id + nl;
            recvn  = ( n & 1 && _id == bh - 1 ) ? 2 : 1;
            lo     = keep;
            hi     = send;
        }
        else
        {
            target = _id - nl;
            recvn  = ( target == bh ) ? ( --target, 0 ) : 1;
            hi     = keep;
            lo     = send;
        }
        partition( bh, lo, hi );
        send_( target, recvn );
        if( _id < bh )
            n = nl;
        else
        {
            n -= nl;
            bl = bh;
        }
    }
}

#define UINT_PER_X( X ) ( ( sizeof( X ) + sizeof( uint ) - 1 ) / sizeof( uint ) )
#define UINT_PER_REAL   UINT_PER_X( realType )
#define UINT_PER_LONG   UINT_PER_X( slong )
#define UINT_PER_ULONG  UINT_PER_X( Ulong )
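
/* e.g. with a 4-byte uint and 8-byte realType/slong/Ulong (a typical LP64
   build) each of these evaluates to 2; rounding up guarantees that every
   field occupies a whole number of uints in the packed buffer below */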

/*-------------------------------------------------------------------------
 Transfer
 -------------------------------------------------------------------------*/
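/* gs_transfer() routes every tuple of tl to the process named by integer
   field pf (which must satisfy pf < mi): tuples are sorted by that field,
   packed into the record format described above, pushed through the crystal
   router, and unpacked; on return, field pf of each received tuple holds the
   rank it came from.  With dynamic != 0 the TupleList is grown as needed;
   with dynamic == 0 a full list is flagged by setting get_n() to
   get_max() + 1 and the remaining incoming tuples are dropped.

   Rough usage sketch (names and sizes are illustrative only):

       TupleList tl;
       tl.initialize( 1, 0, 0, 1, nmax );  // 1 int (dest rank), 1 real per tuple
       tl.enableWriteAccess();
       // fill tl.vi_wr[i] with the destination rank and tl.vr_wr[i] with the
       // payload for each tuple, calling tl.inc_n() per tuple
       crystal.gs_transfer( 1, tl, 0 );    // afterwards vi_wr[i] holds the source rank
 */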
ErrorCode gs_data::crystal_data::gs_transfer( int dynamic, moab::TupleList& tl, unsigned pf )
{

    unsigned mi, ml, mul, mr;
    tl.getTupleSize( mi, ml, mul, mr );

    const unsigned tsize = ( mi - 1 ) + ml * UINT_PER_LONG + mul * UINT_PER_ULONG + mr * UINT_PER_REAL;
    sint p, lp = -1;
    sint* ri;
    slong* rl;
    Ulong* rul;
    realType* rr;
    uint i, j, *buf, *len = 0, *buf_end;

    /* sort to group by target proc */
    if( pf >= mi ) return moab::MB_MEMORY_ALLOCATION_FAILED;
    // pf must index a field of the vi array (pf < mi)
    tl.sort( pf, &all->buf );

    /* pack into buffer for crystal router */
    all->buf.buffer_reserve( ( tl.get_n() * ( 3 + tsize ) ) * sizeof( uint ) );
    all->n = 0;
    buf    = (uint*)all->buf.ptr;

    bool canWrite = tl.get_writeEnabled();
    if( !canWrite ) tl.enableWriteAccess();

    ri  = tl.vi_wr;
    rl  = tl.vl_wr;
    rul = tl.vul_wr;
    rr  = tl.vr_wr;

    for( i = tl.get_n(); i; --i )
    {
        p = ri[pf];
        if( p != lp )
        {
            lp     = p;
            *buf++ = p;   /* target */
            *buf++ = _id; /* source */
            len    = buf++;
            *len   = 0; /* length */
            all->n += 3;
        }
        for( j = 0; j < mi; ++j, ++ri )
            if( j != pf ) *buf++ = *ri;
        for( j = ml; j; --j, ++rl )
        {
            memcpy( buf, rl, sizeof( slong ) );
            buf += UINT_PER_LONG;
        }
        for( j = mul; j; --j, ++rul )
        {
            memcpy( buf, rul, sizeof( Ulong ) );
            buf += UINT_PER_ULONG;
        }
        for( j = mr; j; --j, ++rr )
        {
            memcpy( buf, rr, sizeof( realType ) );
            buf += UINT_PER_REAL;
        }
        *len += tsize, all->n += tsize;
    }

    crystal_router();

    /* unpack */
    buf     = (uint*)all->buf.ptr;
    buf_end = buf + all->n;
    tl.set_n( 0 );
    ri  = tl.vi_wr;
    rl  = tl.vl_wr;
    rul = tl.vul_wr;
    rr  = tl.vr_wr;

    while( buf != buf_end )
    {
        sint llen;
        buf++;         /* target ( == this proc ) */
        p    = *buf++; /* source */
        llen = *buf++; /* length */
        while( llen > 0 )
        {
            if( tl.get_n() == tl.get_max() )
            {
                if( !dynamic )
                {
                    tl.set_n( tl.get_max() + 1 );
                    if( !canWrite ) tl.disableWriteAccess();
                    return moab::MB_SUCCESS;
                }
                ErrorCode rval = tl.resize( tl.get_max() + ( 1 + tl.get_max() ) / 2 + 1 );
                if( rval != moab::MB_SUCCESS )
                {
                    if( !canWrite ) tl.disableWriteAccess();
                    return rval;
                }
                ri  = tl.vi_wr + mi * tl.get_n();
                rl  = tl.vl_wr + ml * tl.get_n();
                rul = tl.vul_wr + mul * tl.get_n();
                rr  = tl.vr_wr + mr * tl.get_n();
            }
            tl.inc_n();
            for( j = 0; j < mi; ++j )
                if( j != pf )
                    *ri++ = *buf++;
                else
                    *ri++ = p;
            for( j = ml; j; --j )
            {
                memcpy( rl++, buf, sizeof( slong ) );
                buf += UINT_PER_LONG;
            }
            for( j = mul; j; --j )
            {
                memcpy( rul++, buf, sizeof( Ulong ) );
                buf += UINT_PER_ULONG;
            }
            for( j = mr; j; --j )
            {
                memcpy( rr++, buf, sizeof( realType ) );
                buf += UINT_PER_REAL;
            }
            llen -= tsize;
        }
    }

    if( !canWrite ) tl.disableWriteAccess();
    return moab::MB_SUCCESS;
}
#endif

/*--------------------------------------------------------------------------
 Combined Execution
 --------------------------------------------------------------------------*/

void gs_data::gs_data_op( realType* u, int op )
{
    local_condense( u, op, this->local_cm );
#ifdef MOAB_HAVE_MPI
    this->nlinfo->nonlocal( u, op, _comm );
#endif
    local_uncondense( u, local_cm );
}

void gs_data::gs_data_op_vec( realType* u, uint n, int op )
{
#ifdef MOAB_HAVE_MPI
    if( n > nlinfo->_maxv )
        moab::fail( "%s: initialized with max vec size = %d,"
                    " but called with vec size = %d\n",
                    __FILE__, nlinfo->_maxv, n );
#endif
    local_condense_vec( u, n, op, local_cm );
#ifdef MOAB_HAVE_MPI
    this->nlinfo->nonlocal_vec( u, n, op, _comm );
#endif
    local_uncondense_vec( u, n, local_cm );
}

void gs_data::gs_data_op_many( realType** u, uint n, int op )
{
    uint i;
#ifdef MOAB_HAVE_MPI
    if( n > nlinfo->_maxv )
        moab::fail( "%s: initialized with max vec size = %d,"
                    " but called with vec size = %d\n",
                    __FILE__, nlinfo->_maxv, n );
#endif
    for( i = 0; i < n; ++i )
        local_condense( u[i], op, local_cm );

#ifndef MOAB_HAVE_MPI
    if( n > 6 ) /* no nlinfo->_maxv without MPI; the interface documents at most 6 vectors */
        moab::fail( "%s: initialized with max vec size = %d,"
                    " but called with vec size = %d\n",
                    __FILE__, 6, n );
#endif

#ifdef MOAB_HAVE_MPI
    this->nlinfo->nonlocal_many( u, n, op, _comm );
#endif
    for( i = 0; i < n; ++i )
        local_uncondense( u[i], local_cm );
}

/*--------------------------------------------------------------------------
 Setup
 --------------------------------------------------------------------------*/
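
/* Overview of gs_data::initialize(): (1) collect the locally nonzero labels
   into a TupleList and sort them by label; (2) find labels that repeat on
   this process and record their indices in the local condense map; (3) with
   MPI, assign each label to a "work" process (label mod np), ship the unique
   labels there through the crystal router, match labels arriving from
   different processes into a symmetric shared list, ship that list back to
   its owners, and condense it into the per-partner tables of nonlocal_info. */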

ErrorCode gs_data::initialize( uint n, const long* label, const Ulong* ulabel, uint maxv, const unsigned int nlabels,
                               const unsigned int nulabels, crystal_data* crystal )
{
    nlinfo = NULL;
    unsigned int j;
    TupleList nonzero, primary;
    ErrorCode rval;
#ifdef MOAB_HAVE_MPI
    TupleList shared;
#else
    moab::TupleList::buffer buf;
#endif
    (void)VALGRIND_CHECK_MEM_IS_DEFINED( label, nlabels * sizeof( long ) );
    (void)VALGRIND_CHECK_MEM_IS_DEFINED( ulabel, nulabels * sizeof( Ulong ) );
#ifdef MOAB_HAVE_MPI
    MPI_Comm_dup( crystal->_comm, &this->_comm );
#else
    buf.buffer_init( 1024 );
#endif

    /* construct list of nonzeros: (index ^, label) */
    nonzero.initialize( 1, nlabels, nulabels, 0, n );
    nonzero.enableWriteAccess();
    {
        uint i;
        sint* nzi;
        long* nzl;
        Ulong* nzul;
        nzi  = nonzero.vi_wr;
        nzl  = nonzero.vl_wr;
        nzul = nonzero.vul_wr;
        for( i = 0; i < n; ++i )
            if( label[nlabels * i] != 0 )
            {
                nzi[0] = i;
                for( j = 0; j < nlabels; j++ )
                    nzl[j] = label[nlabels * i + j];
                for( j = 0; j < nulabels; j++ )
                    nzul[j] = ulabel[nulabels * i + j];
                nzi++;
                nzl += nlabels;
                nzul += nulabels;
                nonzero.inc_n();
            }
    }

    /* sort nonzeros by label: (index ^2, label ^1) */
#ifndef MOAB_HAVE_MPI
    nonzero.sort( 1, &buf );
#else
    nonzero.sort( 1, &crystal->all->buf );
#endif

    /* build list of unique labels w/ lowest associated index:
     (index in nonzero ^, primary (lowest) index in label, count, label(s),
     ulabel(s)) */
    primary.initialize( 3, nlabels, nulabels, 0, nonzero.get_n() );
    primary.enableWriteAccess();
    {
        uint i;
        sint *nzi = nonzero.vi_wr, *pi = primary.vi_wr;
        slong *nzl = nonzero.vl_wr, *pl = primary.vl_wr;
        Ulong *nzul = nonzero.vul_wr, *pul = primary.vul_wr;
        slong last = -1;
        for( i = 0; i < nonzero.get_n(); ++i, nzi += 1, nzl += nlabels, nzul += nulabels )
        {
            if( nzl[0] == last )
            {
                ++pi[-1];
                continue;
            }
            last  = nzl[0];
            pi[0] = i;
            pi[1] = nzi[0];
            for( j = 0; j < nlabels; j++ )
                pl[j] = nzl[j];
            for( j = 0; j < nulabels; j++ )
                pul[j] = nzul[j];
            pi[2] = 1;
            pi += 3, pl += nlabels;
            pul += nulabels;
            primary.inc_n();
        }
    }

    /* calculate size of local condense map */
    {
        uint i, count = 1;
        sint* pi = primary.vi_wr;
        for( i = primary.get_n(); i; --i, pi += 3 )
            if( pi[2] > 1 ) count += pi[2] + 1;
        this->local_cm = (sint*)malloc( count * sizeof( sint ) );
    }

    /* sort unique labels by primary index:
     (nonzero index ^2, primary index ^1, count, label ^2) */
#ifndef MOAB_HAVE_MPI
    primary.sort( 0, &buf );
    buf.reset();
    // buffer_free(&buf);
#else
    primary.sort( 0, &crystal->all->buf );
#endif

    /* construct local condense map */
    {
        uint i, ln;
        sint* pi = primary.vi_wr;
        sint* cm = this->local_cm;
        for( i = primary.get_n(); i > 0; --i, pi += 3 )
            if( ( ln = pi[2] ) > 1 )
            {
                sint* nzi = nonzero.vi_wr + 1 * pi[0];
                for( j = ln; j > 0; --j, nzi += 1 )
                    *cm++ = nzi[0];
                *cm++ = -1;
            }
        *cm++ = -1;
    }
    nonzero.reset();
#ifndef MOAB_HAVE_MPI
    primary.reset();
#else
    /* assign work proc by label modulo np */
    {
        uint i;
        sint* pi  = primary.vi_wr;
        slong* pl = primary.vl_wr;
        for( i = primary.get_n(); i; --i, pi += 3, pl += nlabels )
            pi[0] = pl[0] % crystal->_num;
    }
    rval = crystal->gs_transfer( 1, primary, 0 ); /* transfer to work procs */
    if( rval != MB_SUCCESS ) return rval;
    /* primary: (source proc, index on src, useless, label) */
    /* sort by label */
    primary.sort( 3, &crystal->all->buf );
    /* add sentinel to primary list */
    if( primary.get_n() == primary.get_max() )
        primary.resize( ( primary.get_max() ? primary.get_max() + ( primary.get_max() + 1 ) / 2 + 1 : 2 ) );
    primary.vl_wr[nlabels * primary.get_n()] = -1;
    /* construct shared list: (proc1, proc2, index1, label) */
#ifdef MOAB_HAVE_MPI
    shared.initialize( 3, nlabels, nulabels, 0, primary.get_n() );
    shared.enableWriteAccess();
#endif
    {
        sint *pi1 = primary.vi_wr, *si = shared.vi_wr;
        slong lbl, *pl1 = primary.vl_wr, *sl = shared.vl_wr;
        Ulong *pul1 = primary.vul_wr, *sul = shared.vul_wr;
        for( ; ( lbl = pl1[0] ) != -1; pi1 += 3, pl1 += nlabels, pul1 += nulabels )
        {
            sint* pi2   = pi1 + 3;
            slong* pl2  = pl1 + nlabels;
            Ulong* pul2 = pul1 + nulabels;
            for( ; pl2[0] == lbl; pi2 += 3, pl2 += nlabels, pul2 += nulabels )
            {
                if( shared.get_n() + 2 > shared.get_max() )
                    shared.resize( ( shared.get_max() ? shared.get_max() + ( shared.get_max() + 1 ) / 2 + 1 : 2 ) ),
                        si = shared.vi_wr + shared.get_n() * 3;
                sl    = shared.vl_wr + shared.get_n() * nlabels;
                sul   = shared.vul_wr + shared.get_n() * nulabels;
                si[0] = pi1[0];
                si[1] = pi2[0];
                si[2] = pi1[1];
                for( j = 0; j < nlabels; j++ )
                    sl[j] = pl2[j];
                for( j = 0; j < nulabels; j++ )
                    sul[j] = pul2[j];
                si += 3;
                sl += nlabels;
                sul += nulabels;
                shared.inc_n();
                si[0] = pi2[0];
                si[1] = pi1[0];
                si[2] = pi2[1];
                for( j = 0; j < nlabels; j++ )
                    sl[j] = pl1[j];
                for( j = 0; j < nulabels; j++ )
                    sul[j] = pul1[j];
                si += 3;
                sl += nlabels;
                sul += nulabels;
                shared.inc_n();
            }
        }
    }
    primary.reset();
    rval = crystal->gs_transfer( 1, shared, 0 ); /* transfer to dest procs */
    if( rval != MB_SUCCESS ) return rval;
    /* shared list: (useless, proc2, index, label) */
    /* sort by label */
    shared.sort( 3, &crystal->all->buf );
    /* sort by partner proc */
    shared.sort( 1, &crystal->all->buf );
    /* count partner procs */
    {
        uint i, count = 0;
        sint proc = -1, *si = shared.vi_wr;
        for( i = shared.get_n(); i; --i, si += 3 )
            if( si[1] != proc )
            {
                ++count;
                proc = si[1];
            }
        // this->nlinfo = new nonlocal_info();
        // this->nlinfo->initialize(count,shared.get_n(),
        //                          nlabels, nulabels, maxv);
        this->nlinfo = new nonlocal_info( count, shared.get_n(), nlabels, nulabels, maxv );
    }
    /* construct non-local info */
    {
        uint i;
        sint proc = -1, *si = shared.vi_wr;
        slong* sl      = shared.vl_wr;
        Ulong* ul      = shared.vul_wr;
        uint* target   = this->nlinfo->_target;
        uint* nshared  = this->nlinfo->_nshared;
        uint* sh_ind   = this->nlinfo->_sh_ind;
        slong* slabels = this->nlinfo->_slabels;
        Ulong* ulabels = this->nlinfo->_ulabels;
        for( i = shared.get_n(); i; --i, si += 3 )
        {
            if( si[1] != proc )
            {
                proc       = si[1];
                *target++  = proc;
                *nshared++ = 0;
            }
            ++nshared[-1];
            *sh_ind++ = si[2];
            // don't store 1st slabel
            sl++;
            for( j = 0; j < nlabels - 1; j++ )
                slabels[j] = sl[j];
            for( j = 0; j < nulabels; j++ )
                ulabels[j] = ul[j];
            slabels += nlabels - 1;
            ulabels += nulabels;
            sl += nlabels - 1;
            ul += nulabels;
        }
    }
    shared.reset();
#endif
    return MB_SUCCESS;
}

void gs_data::reset()
{
    free( local_cm );
    local_cm = NULL;
#ifdef MOAB_HAVE_MPI
    if( nlinfo != NULL )
    {
        nlinfo->nlinfo_free();
        delete this->nlinfo;
        MPI_Comm_free( &_comm );
        nlinfo = NULL;
    }
#endif
}

}  // namespace moab
